Note: Some of the figures in this document use grid.arrange or ggarrange. The heights/widths arguments in grid.arrange() and ggarrange() may have to be tweaked to get the proportions of the panels to look less lopsided.
libs <- c('DESeq2', 'edgeR', 'limma', 'tidyverse', 'combinat', 'ggrepel', 'org.Hs.eg.db', 'AnnotationDbi', 'ggpubr', 'fgsea', 'ggbeeswarm', 'MESS', 'pbmcapply')
lapply(libs, library, character.only=TRUE)## [[1]]
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "BiocParallel" "matrixStats" "Biobase"
## [7] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [10] "S4Vectors" "BiocGenerics" "parallel"
## [13] "stats4" "stats" "graphics"
## [16] "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[2]]
## [1] "edgeR" "limma" "DESeq2"
## [4] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [7] "matrixStats" "Biobase" "GenomicRanges"
## [10] "GenomeInfoDb" "IRanges" "S4Vectors"
## [13] "BiocGenerics" "parallel" "stats4"
## [16] "stats" "graphics" "grDevices"
## [19] "utils" "datasets" "methods"
## [22] "base"
##
## [[3]]
## [1] "edgeR" "limma" "DESeq2"
## [4] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [7] "matrixStats" "Biobase" "GenomicRanges"
## [10] "GenomeInfoDb" "IRanges" "S4Vectors"
## [13] "BiocGenerics" "parallel" "stats4"
## [16] "stats" "graphics" "grDevices"
## [19] "utils" "datasets" "methods"
## [22] "base"
##
## [[4]]
## [1] "forcats" "stringr" "dplyr"
## [4] "purrr" "readr" "tidyr"
## [7] "tibble" "ggplot2" "tidyverse"
## [10] "edgeR" "limma" "DESeq2"
## [13] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [16] "matrixStats" "Biobase" "GenomicRanges"
## [19] "GenomeInfoDb" "IRanges" "S4Vectors"
## [22] "BiocGenerics" "parallel" "stats4"
## [25] "stats" "graphics" "grDevices"
## [28] "utils" "datasets" "methods"
## [31] "base"
##
## [[5]]
## [1] "combinat" "forcats" "stringr"
## [4] "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2"
## [10] "tidyverse" "edgeR" "limma"
## [13] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [16] "BiocParallel" "matrixStats" "Biobase"
## [19] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [22] "S4Vectors" "BiocGenerics" "parallel"
## [25] "stats4" "stats" "graphics"
## [28] "grDevices" "utils" "datasets"
## [31] "methods" "base"
##
## [[6]]
## [1] "ggrepel" "combinat" "forcats"
## [4] "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble"
## [10] "ggplot2" "tidyverse" "edgeR"
## [13] "limma" "DESeq2" "SummarizedExperiment"
## [16] "DelayedArray" "BiocParallel" "matrixStats"
## [19] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [22] "IRanges" "S4Vectors" "BiocGenerics"
## [25] "parallel" "stats4" "stats"
## [28] "graphics" "grDevices" "utils"
## [31] "datasets" "methods" "base"
##
## [[7]]
## [1] "org.Hs.eg.db" "AnnotationDbi" "ggrepel"
## [4] "combinat" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr"
## [10] "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "edgeR" "limma"
## [16] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [19] "BiocParallel" "matrixStats" "Biobase"
## [22] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [25] "S4Vectors" "BiocGenerics" "parallel"
## [28] "stats4" "stats" "graphics"
## [31] "grDevices" "utils" "datasets"
## [34] "methods" "base"
##
## [[8]]
## [1] "org.Hs.eg.db" "AnnotationDbi" "ggrepel"
## [4] "combinat" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr"
## [10] "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "edgeR" "limma"
## [16] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [19] "BiocParallel" "matrixStats" "Biobase"
## [22] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [25] "S4Vectors" "BiocGenerics" "parallel"
## [28] "stats4" "stats" "graphics"
## [31] "grDevices" "utils" "datasets"
## [34] "methods" "base"
##
## [[9]]
## [1] "ggpubr" "magrittr" "org.Hs.eg.db"
## [4] "AnnotationDbi" "ggrepel" "combinat"
## [7] "forcats" "stringr" "dplyr"
## [10] "purrr" "readr" "tidyr"
## [13] "tibble" "ggplot2" "tidyverse"
## [16] "edgeR" "limma" "DESeq2"
## [19] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [22] "matrixStats" "Biobase" "GenomicRanges"
## [25] "GenomeInfoDb" "IRanges" "S4Vectors"
## [28] "BiocGenerics" "parallel" "stats4"
## [31] "stats" "graphics" "grDevices"
## [34] "utils" "datasets" "methods"
## [37] "base"
##
## [[10]]
## [1] "fgsea" "Rcpp" "ggpubr"
## [4] "magrittr" "org.Hs.eg.db" "AnnotationDbi"
## [7] "ggrepel" "combinat" "forcats"
## [10] "stringr" "dplyr" "purrr"
## [13] "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "edgeR"
## [19] "limma" "DESeq2" "SummarizedExperiment"
## [22] "DelayedArray" "BiocParallel" "matrixStats"
## [25] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [28] "IRanges" "S4Vectors" "BiocGenerics"
## [31] "parallel" "stats4" "stats"
## [34] "graphics" "grDevices" "utils"
## [37] "datasets" "methods" "base"
##
## [[11]]
## [1] "ggbeeswarm" "fgsea" "Rcpp"
## [4] "ggpubr" "magrittr" "org.Hs.eg.db"
## [7] "AnnotationDbi" "ggrepel" "combinat"
## [10] "forcats" "stringr" "dplyr"
## [13] "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse"
## [19] "edgeR" "limma" "DESeq2"
## [22] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [25] "matrixStats" "Biobase" "GenomicRanges"
## [28] "GenomeInfoDb" "IRanges" "S4Vectors"
## [31] "BiocGenerics" "parallel" "stats4"
## [34] "stats" "graphics" "grDevices"
## [37] "utils" "datasets" "methods"
## [40] "base"
##
## [[12]]
## [1] "MESS" "geeM" "Matrix"
## [4] "geepack" "ggbeeswarm" "fgsea"
## [7] "Rcpp" "ggpubr" "magrittr"
## [10] "org.Hs.eg.db" "AnnotationDbi" "ggrepel"
## [13] "combinat" "forcats" "stringr"
## [16] "dplyr" "purrr" "readr"
## [19] "tidyr" "tibble" "ggplot2"
## [22] "tidyverse" "edgeR" "limma"
## [25] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [28] "BiocParallel" "matrixStats" "Biobase"
## [31] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [34] "S4Vectors" "BiocGenerics" "parallel"
## [37] "stats4" "stats" "graphics"
## [40] "grDevices" "utils" "datasets"
## [43] "methods" "base"
##
## [[13]]
## [1] "pbmcapply" "MESS" "geeM"
## [4] "Matrix" "geepack" "ggbeeswarm"
## [7] "fgsea" "Rcpp" "ggpubr"
## [10] "magrittr" "org.Hs.eg.db" "AnnotationDbi"
## [13] "ggrepel" "combinat" "forcats"
## [16] "stringr" "dplyr" "purrr"
## [19] "readr" "tidyr" "tibble"
## [22] "ggplot2" "tidyverse" "edgeR"
## [25] "limma" "DESeq2" "SummarizedExperiment"
## [28] "DelayedArray" "BiocParallel" "matrixStats"
## [31] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [34] "IRanges" "S4Vectors" "BiocGenerics"
## [37] "parallel" "stats4" "stats"
## [40] "graphics" "grDevices" "utils"
## [43] "datasets" "methods" "base"
rm(libs)
# set up themes for ggplotting
theme <- theme(legend.position = 'bottom',
legend.justification = 'left',
text = element_text(size=12),
axis.text.y = element_text(size=18, hjust = 1, face='bold'),
axis.text.x = element_text(size=12, hjust = 1, face='bold'),
axis.title = element_text(size=14, face = "bold"),
legend.text= element_text(size=12),
legend.title= element_text(size=14, face='bold'),
strip.text=element_text(size=16, face='bold'),
plot.title=element_text(size=18, face='bold'))
# load inputs
load("../data/001_DEGsAndPathwayAnalysis_inputs.rdata")
load("../data/tcga_exprs_integration.rdata")
cbpal <- c("#CC79A7", # purple, primary
"#E69F00", # yellow, A
"#56B4E9", # blue, B
"#009E73", # green, C
"#7f7f7f") #gray, tcga
source("functions.r")# create design matrix using all permunations of sample codes.
# 1 and 0 corresponds to which side of the comparison the sample in the sample_names column is.
# the sample names will just correspond to the rownames of the phenotype table. this gets combined
# in the for-loop below.
d1 <- ((combinat::permn(as.character(c('1', '1', '0', '0')))))
d1 <- t( unique( data.frame( matrix(unlist(d1), nrow = factorial(4), byrow = T))))
class(d1) <- "numeric"
d2 <- ((combinat::permn(as.character(c('1', '1', '1', '0')))))
d2 <- t( unique( data.frame( matrix(unlist(d2), nrow = factorial(4), byrow = T))))
class(d2) <- "numeric"
design <- cbind(d1, d2)
colnames(design) <- NULL
rownames(design) <- NULL
# clean up intermediaries
rm(d1, d2)comparison_list <- NULL # store the comparisons here.
deg_list <- NULL # store the results of the comparison here.
# loop through the combinations in the design matrix to do permuted DEG tests in pooled
# comparisons using DESeq2.
for (i in 1:ncol(design)){
sample_code <- design[, i]
ph <- data.frame(sample_name=rownames(case.phenotype), sample_code=sample_code)
dds <- DESeqDataSetFromMatrix(countData = gcounts, colData = ph, design = ~sample_code)
dds <- DESeq(dds)
res <- results(dds)
resOrdered <- as.data.frame(res[order(res$padj),])
resOrdered$ensid <- rownames(resOrdered)
deg_list[[i]] <- resOrdered
comparison_list[[i]] <- ph
}
# clean up intermediaries
rm(resOrdered, dds, res, ph, sample_code)# now take the comparison list into something easier to coerce into a sample v sample string
# for the volcano plot labels
comparison_names <- NULL
for (i in 1:length(comparison_list)){
comparison <- comparison_list[[i]]
s0 <- paste(as.character(comparison[comparison$sample_code == 0,]$sample_name), collapse = ', ')
s1 <- paste(as.character(comparison[comparison$sample_code == 1,]$sample_name), collapse=', ')
deg_list[[i]]$sample_comparison <- as.character(paste(s0, s1, sep=' - vs - '))
comparison_names[i] <- as.character(paste(s0, s1, sep=' - vs - '))
}
names(deg_list) <- comparison_names
rm(s0, s1)
# get top 25 p values and top/bottom logFC values
top_genes <- lapply(deg_list, function(x){
x <- x %>% filter(!is.na(padj)) %>%
filter(padj < 0.5) %>%
left_join(genes, by=c('ensid'))
n <- 25
x1 <- x %>% top_n(n, wt=as.numeric(-log(padj)))
x2 <- x %>% filter(log2FoldChange > 1) %>% top_n(n, wt=log2FoldChange)
x3 <- x %>% filter(log2FoldChange < -1) %>% top_n(n, wt=abs(log2FoldChange))
x <- unique(rbind(x1, x2, x3))
rm(x1, x2, x3)
return(x)
})
top_genes <- do.call(rbind, top_genes)# coerce results to data frame and remove mirrored comparisons
mirrored_comparisons <- c("C, P - vs - A, B", "B, C - vs - A, P", "B, P - vs - A, C")
deg_df <- do.call(rbind, deg_list)
deg_df <- deg_df %>% as_tibble() %>%
filter(!is.na(padj)) %>%
filter(!sample_comparison %in% mirrored_comparisons) %>%
left_join(genes, by=c('ensid'))
top_genes <- top_genes %>%
filter(!is.na(padj)) %>%
filter(!sample_comparison %in% mirrored_comparisons)
summary(as.factor(top_genes$sample_comparison))## A - vs - B, C, P A, B - vs - C, P A, C - vs - B, P A, P - vs - B, C
## 1 71 29 1
## B - vs - A, C, P C - vs - A, B, P P - vs - A, B, C
## 62 71 68
volcano_plot_filt <- ggplot(data=deg_df[! deg_df$sample_comparison %in% c('A - vs - B, C, P', 'A, P - vs - B, C'),]) + aes(x=log2FoldChange, y=-log(padj), color=biotype) +
geom_point() +
geom_label_repel(data=top_genes[! top_genes$sample_comparison %in% c('A - vs - B, C, P', 'A, P - vs - B, C'),], aes(label=gene), show.legend=FALSE, force=3, size=3) +
facet_wrap(~ sample_comparison, scales='free',nrow=2) +
theme_classic() + theme +
labs(title='Volcano Plot: DEGs in Pooled Sample Comparisons')
volcano_plot_filtFirst coerce the deg results from DEseq2 into a list of ranks for each comparison using the method described here: https://stephenturner.github.io/deseq-to-fgsea/
Then, we will use fgsea to perform gene set enrichment on all the MSigDB oncogenic signatures.
deg_ranks <- lapply(deg_list, function(x){
x %>%
left_join(genes, by=c('ensid')) %>%
na.omit() %>%
distinct() %>%
group_by(gene) %>%
summarise(stat=mean(stat)) %>%
deframe()
} )
deg_ranks <- deg_ranks[! names(deg_ranks) %in% mirrored_comparisons]
deg_ranks <- deg_ranks[! names(deg_ranks) %in% c("A - vs - B, C, P","A, P - vs - B, C")]
hallmark_gsigs <- as.list(qusage::read.gmt('../data/h.all.v6.2.symbols.gmt'))
kegg_gsigs <- as.list(qusage::read.gmt('../data/c2.cp.kegg.v6.2.symbols.gmt'))
biocarta_gsigs <- as.list(qusage::read.gmt('../data/c2.cp.biocarta.v6.2.symbols.gmt'))
reactome_gsigs <- as.list(qusage::read.gmt('../data/c2.cp.reactome.v6.2.symbols.gmt'))Generate the enrichment scores & coerce to dataframe
# compute gene set enrichments
all_fgsea <- list(hallmark_fgsea = gsea_lister(rank_list = deg_ranks, sig_list = hallmark_gsigs, sig_name='HALLMARK'),
kegg_fgsea = gsea_lister(rank_list = deg_ranks, sig_list = kegg_gsigs, sig_name = 'KEGG'),
reactome_fgsea = gsea_lister(rank_list = deg_ranks, sig_list = reactome_gsigs, sig_name = 'REACTOME'),
biocarta_fgsea = gsea_lister(rank_list = deg_ranks, sig_list = biocarta_gsigs, sig_name = 'BIOCARTA')
)
rm(hallmark_gsigs, kegg_gsigs, reactome_gsigs, biocarta_gsigs)Select the top pathways, filter irrelevant terms, and order the pathways via hierarchical clustering.
# select top pathways & order pathways via hclust
data_list <- lapply(all_fgsea, function(x){
sig_name <- levels(as.factor(x$signature_name))
x$pathway <- gsub(paste0(sig_name, '_'), '', x=x$pathway)
x$sample_comparison <- factor(x$sample_comparison, levels=names(deg_ranks))
df <- x %>%
filter(padj < 0.05) %>% arrange(padj) %>% group_by(sample_comparison) %>%
top_n(10, wt=-padj) %>% top_n(10, wt=abs(NES)) %>%
arrange(abs(NES), -padj)
tmp <- df %>% dplyr::select(sample_comparison, pathway, NES) %>%
group_by(sample_comparison) %>% tidyr::spread(pathway, NES) %>%
as.data.frame()
names <- tmp$sample_comparison
tmp$sample_comparison <- NULL
tmp <- as.data.frame(lapply(tmp, function(x){
x[is.na(x)] <- 0
return(x) }))
rownames(tmp) <- names
order <- hclust( dist(t(tmp), method = "euclidean"), method = "ward.D" )$order
df$pathway <- factor(df$pathway,levels=colnames(tmp)[order])
df <- df[!is.na(df$pathway),]
df <- df[df$pathway != 'NA',]
rm(names, tmp)
return(df)
})
# define terms that are not relevant, and will be hidden in the resulting visualization
terms <- c('INFLUENZA', 'CELL_CYCLE', 'MITOTIC', 'S_PHASE', 'G1_S_TRANSITION', 'ASTHMA', 'AUTOIMMUNE_THYROID_DISEASE',
'INFECTION','DISEASE', 'INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION','METABOLISM', 'APICAL_SURFACE','GENESIS',
'43S', 'SYNAP','NEUROTRANSMITTER', 'GABA', 'GLUTAMATE', 'POTASSIUM_CHANNELS', 'CHANNEL_TRANSPORT', 'CELL_LINEAGE',
'LUPUS', 'NEURONAL', 'RP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE', 'TRANSLATION', 'proteasome', 'complement', 'homologous_recombination',
"peptide_chain",'second_messenger','NONSENSE_MEDIATED_DECAY', 'G2M_checkpoint', 'G2_M_CHECKPOINT', 'IMMUNOREGULATORY_INTERACTIONS_BETWEEN',
'RIBOSOME', 'TIGHT_JUNCTION')
# apply filters
data_filt <- lapply(data_list, function(x){
y <- x
for (term in terms){
y <- y %>% filter(!grepl(term, x=pathway, ignore.case=TRUE))
}
df <- y
tmp <- df %>% dplyr::select(sample_comparison, pathway, NES) %>%
group_by(sample_comparison) %>% tidyr::spread(pathway, NES) %>%
as.data.frame()
names <- tmp$sample_comparison
tmp$sample_comparison <- NULL
tmp <- as.data.frame(lapply(tmp, function(x){
x[is.na(x)] <- 0
return(x) }))
rownames(tmp) <- names
order <- hclust( dist(t(tmp), method = "euclidean"), method = "ward.D" )$order
df$pathway <- factor(df$pathway,levels=colnames(tmp)[order])
df <- df[!is.na(df$pathway),]
df <- df[df$pathway != 'NA',]
rm(names, tmp)
return(df)
})Make one plot per gene set database and visualize as a set of stacked heatmaps.
# set up themes for ggplotting
theme <- theme(text = element_text(size=16),
axis.text.y = element_text(size=18, face='bold'),
axis.text.x = element_text(size=18, face='bold'),
axis.title = element_text(size=24, face = "bold"),
legend.text= element_text(size=18),
legend.title= element_text(size=24, face='bold'),
strip.text=element_text(size=24, face='bold'),
plot.title=element_text(size=28, face='bold'))
plot_list <- lapply(data_filt, function(x){
plot <- ggplot(data=x) +
aes(x=sample_comparison, y=pathway, fill=as.numeric(NES)) +
geom_tile(color='white', size=1) +
theme_classic() +
theme + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_x_discrete(drop=FALSE) +
labs(x ='', y=levels(as.factor(x$signature_name))) +
scale_fill_gradient2(low = "lightseagreen", mid='white', high = "tomato", name="NES") +
guides(fill = guide_colourbar(barwidth = 1.5, barheight = 25))
return(plot)
})
plot_list[[4]] <- plot_list[[4]] + theme_classic() + theme + rotate_x_text(angle=45)
names(plot_list) <- c('HALLMARK', 'KEGG', 'REACTOME', 'BIOCARTA')
pathway_hm <- ggarrange(plotlist=plot_list, nrow=length(all_fgsea), align='v', common.legend=TRUE, legend='right', heights=c(1.2,0.6,0.5,1.6))
pathway_hmind$label[1:4] <- c('Recurrent_A', 'Recurrent_B', 'Recurrent_C', 'Primary')
ind$sample <- ind$label
ind[is.na(ind$sample),]$sample <- 'TCGA-GBM'
case_int_pcs <- ind
tcga_pca <- ggplot(data = case_int_pcs) +
aes(x=Dim.1, y=Dim.2, color=sample, label=label) +
geom_density2d(color="#7f7f7f", bins=50, alpha=0.7) +
geom_jitter(alpha=0.7, size=5) +
geom_label_repel(size=8, show.legend=FALSE, force=10, min.segment.length = 10, segment.size = 1) +
labs(x='PC-1: 14.3% of variance explained', y='PC-2: 9% of variance explained') +
theme_classic() + theme(plot.margin=margin(0.5,0.5,0.5,0.5, "in")) +
theme + scale_color_manual(values = cbpal)
tcga_pcaObjective: get a measure (a p-value) of whether the variance between my four samples is greater than if you were to take the variance four random TCGA samples. To do this, do resamplings of 10% of the total possible combinations of 4 tcga-GBM samples from a pool of 155 samples.
checkpoint.genes <- c('TIM3', 'HAVCR2', 'GMZA','CD208','LAMP3', 'CD28', 'CD3D', 'CD3G',
'CD3E', "PDCD1LG2", 'CD274', 'PDCD1LG1', 'PDL1', "FOXP3", 'CD20', 'MS4A1', 'CD68',
'CD34', 'CD38', 'CD68', 'LAMP4', 'SCARD1', 'GP110', 'CD4', 'PDCD1', 'CD274')
chk <- as.data.frame(t(batch.adj.v[rownames(batch.adj.v) %in% checkpoint.genes, ]))
chk$submitter_id <- rownames(chk)
chk$label <- chk$submitter_id
chk$label[!chk$label %in% c('Primary', "Recurrent_A", 'Recurrent_B', 'Recurrent_C') ] <- NA
# clean up gene names
colnames(chk)[1] <- 'PDL1' # 'PDCD1LG1'/CD274 is PDL1
colnames(chk)[5] <- 'CD3' # use CD3D expression only
colnames(chk)[10] <- 'TIM3' # HAVCR2
colnames(chk)[11] <- 'DCLAMP' # LAMP3
colnames(chk)[12] <- 'PD1' # PDCD1 is PD1
colnames(chk)[13] <- 'PDL2' # 'PDCD1LG2' is PDL2
val_cols <- c('PD1', 'PDL1', 'PDL2', 'TIM3', 'CD28', 'CD3', 'CD4', 'FOXP3', 'DCLAMP')
chk <- chk[, c(val_cols, 'submitter_id', 'label')]
chk$submitter_id <- as.character(chk$submitter_id)# separate out samples from case study
ref_df <- chk[1:4,]
ref_df <- data.frame(var=sapply(ref_df[, val_cols], var),
samp_type=as.character('test'),
val=val_cols)
set.seed(0)
length(combn(chk[- c(1:4),]$submitter_id, 4, simplify=FALSE))## [1] 20811575
boot_pool <- chk[- c(1:4),]
# grab all unique combinations of 4 from the pool of 155 TCGA samples
# and then only take 10% of that cause there's too many
combinations <- combn(boot_pool$submitter_id, 4, simplify=FALSE)
combinations <- base::sample(combinations, length(combinations)*0.1)# function to take variance of each combination of 4 for each gene/column
boot_fx <- function(x){
# grab the combination of 4
x <- boot_pool %>%
filter(submitter_id %in% x) %>%
dplyr::select(val_cols)
# take the variance for each gene
x <- data.frame(var=sapply(x, var),
val=val_cols,
samp_type=as.character('random_boot'))
return(x)
}
# do the resampling; using 3 cores (but the more the merrier!), and then bind the rows.
boot_pool <- bind_rows(pbmclapply(combinations, boot_fx, mc.cores=3))
all_boots <- bind_rows(ref_df, boot_pool)## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# get a measure of whether the variance between my four samples is greater than
# if you were to take four random TCGA samples.
pvals <- lapply(levels(factor(all_boots$val)), function(i){
x <- all_boots %>% filter(val == i) %>% as.data.frame()
test_point <- x %>% filter(samp_type == "test") %>% dplyr::select(var) %>% deframe()
z <- stats::density(x$var, n=length(x$var))
xval <- as.numeric(as.matrix(as.numeric(z$x)))
yval <- as.numeric(as.matrix(as.numeric(z$y)))
# test right-sided
rpval <- auc(x=xval, y =yval, from=test_point, to=max(xval) )
# test left-sided
lpval <- auc(x=xval, y =yval, from=min(xval), to=test_point )
# pick the smallest of the two left-sided and right-sided tests
# to get the auc (p-value) for the closest end of the distribution
result <- min(c(rpval, lpval))
return(result)
})
names(pvals) <- levels(factor(all_boots$val))
pvals <- unlist(pvals)
pvaldf <- data.frame(pval=pvals,
gene=names(pvals))
pvaldf## # A tibble: 9 x 2
## pval gene
## <dbl> <fct>
## 1 0.249 CD28
## 2 0.413 CD3
## 3 0.313 CD4
## 4 0.0957 DCLAMP
## 5 0.0118 FOXP3
## 6 0.211 PD1
## 7 0.0409 PDL1
## 8 0.0828 PDL2
## 9 0.0882 TIM3
ind2 <- tidyr::gather(chk, gene, cpm, val_cols)
ind2$gene <- as.factor(ind2$gene)
ind2$cpm <- as.numeric(ind2$cpm)
ind2 <- ind2 %>% inner_join(pvaldf, by=c('gene'))
ind2$sample <- as.character(ind2$label)
ind2$sample[is.na(ind2$sample)] <- 'TCGA-GBM'
ind2$sample <- as.factor(ind2$sample)
checkpoint_violins <- ggplot(data=ind2) +
aes(y=cpm, x=gene, color=sample) +
geom_violin(alpha=0.05, aes(color=NULL), fill="#7f7f7f", show.legend = FALSE) +
geom_boxplot(width=0.1, aes(color=NULL, fill=NULL), show.legend = FALSE ) +
geom_beeswarm(data=ind2[ind2$sample =='TCGA-GBM',],cex=0.8,alpha=0.4, size=2, show.legend=FALSE) +
geom_point(data=ind2[ind2$sample !='TCGA-GBM',], size=6, alpha=0.8) +
theme_classic() + theme + labs(y='Log2(CPM)', x='', color='Sample') +
geom_text(data=pvaldf[pvaldf$pval<0.05,], y=-5,size=6, aes(x=gene, label=paste('P =',round(pval, digits=4)), color=NULL), show.legend=FALSE ) +
scale_color_manual(values = cbpal, aesthetics = 'color')
checkpoint_violins## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pbmcapply_1.5.0 MESS_0.5.5
## [3] geeM_0.10.1 Matrix_1.2-17
## [5] geepack_1.2-1 ggbeeswarm_0.6.0
## [7] fgsea_1.8.0 Rcpp_1.0.2
## [9] ggpubr_0.2.3 magrittr_1.5
## [11] org.Hs.eg.db_3.7.0 AnnotationDbi_1.44.0
## [13] ggrepel_0.8.1 combinat_0.0-8
## [15] forcats_0.4.0 stringr_1.4.0
## [17] dplyr_0.8.3 purrr_0.3.2
## [19] readr_1.3.1 tidyr_0.8.3
## [21] tibble_2.1.3 ggplot2_3.2.1
## [23] tidyverse_1.2.1 edgeR_3.24.3
## [25] limma_3.38.3 DESeq2_1.22.2
## [27] SummarizedExperiment_1.12.0 DelayedArray_0.8.0
## [29] BiocParallel_1.16.6 matrixStats_0.55.0
## [31] Biobase_2.42.0 GenomicRanges_1.34.0
## [33] GenomeInfoDb_1.18.2 IRanges_2.16.0
## [35] S4Vectors_0.20.1 BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 ggsignif_0.6.0
## [4] estimability_1.3 htmlTable_1.13.1 XVector_0.22.0
## [7] base64enc_0.1-3 rstudioapi_0.10 bit64_0.9-7
## [10] fansi_0.4.0 mvtnorm_1.0-11 lubridate_1.7.4
## [13] xml2_1.2.2 codetools_0.2-16 splines_3.5.2
## [16] geneplotter_1.60.0 knitr_1.24 zeallot_0.1.0
## [19] Formula_1.2-3 jsonlite_1.6 broom_0.5.2
## [22] annotate_1.60.1 cluster_2.1.0 compiler_3.5.2
## [25] httr_1.4.1 emmeans_1.4 lsmeans_2.30-0
## [28] backports_1.1.4 assertthat_0.2.1 lazyeval_0.2.2
## [31] cli_1.1.0 acepack_1.4.1 htmltools_0.3.6
## [34] tools_3.5.2 gtable_0.3.0 glue_1.3.1
## [37] GenomeInfoDbData_1.2.0 fastmatch_1.1-0 cellranger_1.1.0
## [40] vctrs_0.2.0 nlme_3.1-141 xfun_0.9
## [43] rvest_0.3.4 qusage_2.16.1 XML_3.98-1.20
## [46] MASS_7.3-51.4 zoo_1.8-6 zlibbioc_1.28.0
## [49] scales_1.0.0 hms_0.5.1 sandwich_2.5-1
## [52] RColorBrewer_1.1-2 yaml_2.2.0 memoise_1.1.0
## [55] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-28
## [58] stringi_1.4.3 RSQLite_2.1.2 genefilter_1.64.0
## [61] checkmate_1.9.4 rlang_0.4.0 pkgconfig_2.0.2
## [64] bitops_1.0-6 evaluate_0.14 lattice_0.20-38
## [67] labeling_0.3 htmlwidgets_1.3 cowplot_1.0.0
## [70] bit_1.1-14 tidyselect_0.2.5 R6_2.4.0
## [73] generics_0.0.2 Hmisc_4.2-0 multcomp_1.4-10
## [76] DBI_1.0.0 pillar_1.4.2 haven_2.1.1
## [79] foreign_0.8-72 withr_2.1.2 survival_2.44-1.1
## [82] RCurl_1.95-4.12 nnet_7.3-12 modelr_0.1.5
## [85] crayon_1.3.4 utf8_1.1.4 rmarkdown_1.15
## [88] locfit_1.5-9.1 grid_3.5.2 readxl_1.3.1
## [91] data.table_1.12.2 blob_1.2.0 digest_0.6.20
## [94] xtable_1.8-4 munsell_0.5.0 beeswarm_0.2.3
## [97] vipor_0.4.5